/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: November 2024

Inputs: 	pdb_full_structures.dta
			pdb_full_papers.dta
			pdb_analysis.dta
			phat.dta
			
Outputs:	Figures 9, 12

Purpose: 	Test whether high-Q structures more likely to be involved in drugs
			Compare initial quality, best quality, SG counterfactual quality

*******************************************************************************/


* Assemble data, keeping SG researchers
clear all
use "${data_built}pdb_full_structures.dta"
merge 1:1 structureId using "${data_built}pdb_analysis.dta"
assert _merge != 2
keep if _merge == 3
drop _merge

merge m:1 pubmedId using "${data_built}pdb_full_papers.dta"
keep if _merge != 2
drop _merge

merge 1:1 structureId using "${data_built}phat.dta"
assert _merge == 3
drop _merge

* Generate interaction
generate nonSG = (structuralGenomics == 0)
gen interaction = predPtileCites3 * nonSG


* Compute the standardized best measures, using the same mean and SD that we use 	
* to standardize the regular quality measures, so magnitudes are comparable

* Note: not doing this the comp_qual_new_variables file because it requires 
* defining the analysis sample

* Shorten the resolution variable names for ease
rename refinementResolution resolution
rename refinementResolutionBest resolutionBest
	
* Standardize the "best" measures
foreach Q in resolution rFree ramaRaw {
	sum `Q'
	local mean = r(mean)
	local sd = r(sd)
	gen `Q'BestStd = -(`Q'Best - `mean')/`sd'
}
	
* Generate an index by summing the standardized best measures
gen indexBest = resolutionBestStd + rFreeBestStd + ramaRawBestStd
	
* Compute the mean and SD of the original index
gen index = resolutionStd + rFreeStd + ramaRawStd
sum index
local mean = r(mean)
local sd = r(sd)

* Standardize the best index using mean and SD of the original index
gen indexBestStd = (indexBest - `mean')/`sd'

/*------------------------------------------------------------------------------

	Figure 9: Relationship between Structure Quality and Drug Development

------------------------------------------------------------------------------*/

* Drug development vs. resolution binscatter
binscatter drugbank_all_count resolutionBest, xline(2.5) linetype(none) 	///
	xtitle("Best refinement resolution" "(Lower is better)") 				///
	ytitle("Number of drugs targeting structure") mcolor(navy%30)			///
	title("Panel A: Refinement Resolution") name(resolution, replace) 

* Drug development vs. R-free binscatter
binscatter drugbank_all_count rFreeBest, xline(.25) linetype(none) 			///
		   xtitle("Best R-free" "(Lower is better)") 						///
		   ytitle("Number of drugs targeting structure") 					///
		   title("Panel B: R-free") name(rfree, replace) mcolor(navy%30)
		   
graph combine resolution rfree, xsize(3) ysize(1) scale(1.75)

graph save "${figures}figure9.gph", replace
graph export "${figures}figure9.pdf", replace 

	
/*------------------------------------------------------------------------------

	Figure 12: Subsequent Structure Deposits and Quality Improvement

------------------------------------------------------------------------------*/

* Impute the counterfactual quality in non-SG researchers behaved like SG
	
* Loop over each quality outcome
foreach y in resolutionStd rFreeStd ramaRawStd indexStd {
	 
	* Run the interacted regression 
	reg `y' predPtileCites3 nonSG interaction $time_controls, robust
		
	* Predict the values if structure had been SG
	gen `y'SG = `y' - _b[nonSG]*nonSG - _b[interaction]*interaction
		
	assert `y'SG == `y' if nonSG == 0
		
}

* Binscatter of P vs. Q, grab relevant parts of the stored command
binscatter indexStd predPtileCites3 if structuralGenomics == 0, 			///
control($time_controls)
	* Access the binscatter graph command and save the relevant parts as locals
	local temp1 = e(graphcmd)
	tokenize "`temp1'", parse("(")
	local temp2 = "`3'"
	local temp3 = "`9'"
	local temp4 = "`11'"
	tokenize "`temp2'", parse(",")
	local scatter1 = "`1'"
	tokenize "`temp3'", parse(",")
	local line1 = "`1'"
	tokenize "`temp4'", parse(")")
	local range1 = "`1'"

* Binscatter of P vs. Q_counterfactual, grab relevant parts of the stored command
binscatter indexStdSG predPtileCites3 if structuralGenomics == 0, 			///
control($time_controls)
	* Access the binscatter graph command and save the relevant parts as locals
	local temp1 = e(graphcmd)
	tokenize "`temp1'", parse("(")
	local temp2 = "`3'"
	local temp3 = "`9'"
	local temp4 = "`11'"
	tokenize "`temp2'", parse(",")
	local scatter2 = "`1'"
	tokenize "`temp3'", parse(",")
	local line2 = "`1'"
	tokenize "`temp4'", parse(")")
	local range2 = "`1'"
	
* Binscatter of P vs. Q_best, grab relevant parts of the stored command
binscatter indexBestStd predPtileCites3 if structuralGenomics == 0, 		///
control($time_controls)
	* Access the binscatter graph command and save the relevant parts as locals
	local temp1 = e(graphcmd)
	tokenize "`temp1'", parse("(")
	local temp2 = "`3'"
	local temp3 = "`9'"
	local temp4 = "`11'"
	tokenize "`temp2'", parse(",")
	local scatter3 = "`1'"
	tokenize "`temp3'", parse(",")
	local line3 = "`1'"
	tokenize "`temp4'", parse(")")
	local range3 = "`1'"
	
* Combine these three binscatters
 twoway (`scatter1', mcolor(navy%30)) 										///
	(`line1', range(`range1') lcolor(navy)) 								///
	(`scatter2', mcolor(maroon%30) msymbol(D)) 								///
	(`line2', range(`range2') lcolor(maroon)) 								///
	(`scatter3', mcolor(green%30) msymbol(T)) 								///
	(`line3', range(`range3') lcolor(green)), 								///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Standardized quality index") 									///
	xlab(20(20)80) ylab(-0.5(0.25)0.5) 										///
	legend(order(1 "Initial structure" 3 "Counterfactual structure" 5 		///
	"Best structure") position(6) row(1)) xsize(3) ysize(2)
	
graph save "${figures}figure12.gph", replace
graph export "${figures}figure12.pdf", replace 

